{
  "cells": [
    {
      "cell_type": "code",
      "execution_count": 17,
      "id": "81acb470",
      "metadata": {},
      "outputs": [],
      "source": [
        "import os\n",
        "from dotenv import load_dotenv\n",
        "from openai import OpenAI\n",
        "from rich import print\n",
        "\n",
        "load_dotenv()\n",
        "OPENAI_API_KEY = os.getenv(\"OPENAI_API_KEY\")\n",
        "\n",
        "client = OpenAI(api_key=OPENAI_API_KEY)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 18,
      "id": "b5532458",
      "metadata": {},
      "outputs": [],
      "source": [
        "from openai import OpenAI\n",
        "client = OpenAI()\n",
        "\n",
        "batch_input_file = client.files.create(\n",
        "  file=open(\"batchinput.jsonl\", \"rb\"),\n",
        "  purpose=\"batch\"\n",
        ")\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 19,
      "id": "41e62fa8",
      "metadata": {},
      "outputs": [
        {
          "data": {
            "text/html": [
              "<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"color: #800080; text-decoration-color: #800080; font-weight: bold\">FileObject</span><span style=\"font-weight: bold\">(</span>\n",
              "    <span style=\"color: #808000; text-decoration-color: #808000\">id</span>=<span style=\"color: #008000; text-decoration-color: #008000\">'file-5UhXBkBba7yLBUhEALqUQea3'</span>,\n",
              "    <span style=\"color: #808000; text-decoration-color: #808000\">bytes</span>=<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">4758</span>,\n",
              "    <span style=\"color: #808000; text-decoration-color: #808000\">created_at</span>=<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">1726847544</span>,\n",
              "    <span style=\"color: #808000; text-decoration-color: #808000\">filename</span>=<span style=\"color: #008000; text-decoration-color: #008000\">'batchinput.jsonl'</span>,\n",
              "    <span style=\"color: #808000; text-decoration-color: #808000\">object</span>=<span style=\"color: #008000; text-decoration-color: #008000\">'file'</span>,\n",
              "    <span style=\"color: #808000; text-decoration-color: #808000\">purpose</span>=<span style=\"color: #008000; text-decoration-color: #008000\">'batch'</span>,\n",
              "    <span style=\"color: #808000; text-decoration-color: #808000\">status</span>=<span style=\"color: #008000; text-decoration-color: #008000\">'processed'</span>,\n",
              "    <span style=\"color: #808000; text-decoration-color: #808000\">status_details</span>=<span style=\"color: #800080; text-decoration-color: #800080; font-style: italic\">None</span>\n",
              "<span style=\"font-weight: bold\">)</span>\n",
              "</pre>\n"
            ],
            "text/plain": [
              "\u001b[1;35mFileObject\u001b[0m\u001b[1m(\u001b[0m\n",
              "    \u001b[33mid\u001b[0m=\u001b[32m'file-5UhXBkBba7yLBUhEALqUQea3'\u001b[0m,\n",
              "    \u001b[33mbytes\u001b[0m=\u001b[1;36m4758\u001b[0m,\n",
              "    \u001b[33mcreated_at\u001b[0m=\u001b[1;36m1726847544\u001b[0m,\n",
              "    \u001b[33mfilename\u001b[0m=\u001b[32m'batchinput.jsonl'\u001b[0m,\n",
              "    \u001b[33mobject\u001b[0m=\u001b[32m'file'\u001b[0m,\n",
              "    \u001b[33mpurpose\u001b[0m=\u001b[32m'batch'\u001b[0m,\n",
              "    \u001b[33mstatus\u001b[0m=\u001b[32m'processed'\u001b[0m,\n",
              "    \u001b[33mstatus_details\u001b[0m=\u001b[3;35mNone\u001b[0m\n",
              "\u001b[1m)\u001b[0m\n"
            ]
          },
          "metadata": {},
          "output_type": "display_data"
        }
      ],
      "source": [
        "print(batch_input_file)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 20,
      "id": "cfb522cd",
      "metadata": {},
      "outputs": [],
      "source": [
        "batch_input_file_id = batch_input_file.id\n",
        "\n",
        "batch = client.batches.create(\n",
        "    input_file_id=batch_input_file_id,\n",
        "    endpoint=\"/v1/embeddings\",\n",
        "    completion_window=\"24h\",\n",
        "    metadata={\n",
        "      \"description\": \"nightly eval job\"\n",
        "    }\n",
        ")"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 21,
      "id": "c1578a92",
      "metadata": {},
      "outputs": [
        {
          "data": {
            "text/html": [
              "<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"color: #800080; text-decoration-color: #800080; font-weight: bold\">Batch</span><span style=\"font-weight: bold\">(</span>\n",
              "    <span style=\"color: #808000; text-decoration-color: #808000\">id</span>=<span style=\"color: #008000; text-decoration-color: #008000\">'batch_katOshqMSik1DI8qwXtuHaUj'</span>,\n",
              "    <span style=\"color: #808000; text-decoration-color: #808000\">completion_window</span>=<span style=\"color: #008000; text-decoration-color: #008000\">'24h'</span>,\n",
              "    <span style=\"color: #808000; text-decoration-color: #808000\">created_at</span>=<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">1726847573</span>,\n",
              "    <span style=\"color: #808000; text-decoration-color: #808000\">endpoint</span>=<span style=\"color: #008000; text-decoration-color: #008000\">'/v1/embeddings'</span>,\n",
              "    <span style=\"color: #808000; text-decoration-color: #808000\">input_file_id</span>=<span style=\"color: #008000; text-decoration-color: #008000\">'file-5UhXBkBba7yLBUhEALqUQea3'</span>,\n",
              "    <span style=\"color: #808000; text-decoration-color: #808000\">object</span>=<span style=\"color: #008000; text-decoration-color: #008000\">'batch'</span>,\n",
              "    <span style=\"color: #808000; text-decoration-color: #808000\">status</span>=<span style=\"color: #008000; text-decoration-color: #008000\">'validating'</span>,\n",
              "    <span style=\"color: #808000; text-decoration-color: #808000\">cancelled_at</span>=<span style=\"color: #800080; text-decoration-color: #800080; font-style: italic\">None</span>,\n",
              "    <span style=\"color: #808000; text-decoration-color: #808000\">cancelling_at</span>=<span style=\"color: #800080; text-decoration-color: #800080; font-style: italic\">None</span>,\n",
              "    <span style=\"color: #808000; text-decoration-color: #808000\">completed_at</span>=<span style=\"color: #800080; text-decoration-color: #800080; font-style: italic\">None</span>,\n",
              "    <span style=\"color: #808000; text-decoration-color: #808000\">error_file_id</span>=<span style=\"color: #800080; text-decoration-color: #800080; font-style: italic\">None</span>,\n",
              "    <span style=\"color: #808000; text-decoration-color: #808000\">errors</span>=<span style=\"color: #800080; text-decoration-color: #800080; font-style: italic\">None</span>,\n",
              "    <span style=\"color: #808000; text-decoration-color: #808000\">expired_at</span>=<span style=\"color: #800080; text-decoration-color: #800080; font-style: italic\">None</span>,\n",
              "    <span style=\"color: #808000; text-decoration-color: #808000\">expires_at</span>=<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">1726933973</span>,\n",
              "    <span style=\"color: #808000; text-decoration-color: #808000\">failed_at</span>=<span style=\"color: #800080; text-decoration-color: #800080; font-style: italic\">None</span>,\n",
              "    <span style=\"color: #808000; text-decoration-color: #808000\">finalizing_at</span>=<span style=\"color: #800080; text-decoration-color: #800080; font-style: italic\">None</span>,\n",
              "    <span style=\"color: #808000; text-decoration-color: #808000\">in_progress_at</span>=<span style=\"color: #800080; text-decoration-color: #800080; font-style: italic\">None</span>,\n",
              "    <span style=\"color: #808000; text-decoration-color: #808000\">metadata</span>=<span style=\"font-weight: bold\">{</span><span style=\"color: #008000; text-decoration-color: #008000\">'description'</span>: <span style=\"color: #008000; text-decoration-color: #008000\">'nightly eval job'</span><span style=\"font-weight: bold\">}</span>,\n",
              "    <span style=\"color: #808000; text-decoration-color: #808000\">output_file_id</span>=<span style=\"color: #800080; text-decoration-color: #800080; font-style: italic\">None</span>,\n",
              "    <span style=\"color: #808000; text-decoration-color: #808000\">request_counts</span>=<span style=\"color: #800080; text-decoration-color: #800080; font-weight: bold\">BatchRequestCounts</span><span style=\"font-weight: bold\">(</span><span style=\"color: #808000; text-decoration-color: #808000\">completed</span>=<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">0</span>, <span style=\"color: #808000; text-decoration-color: #808000\">failed</span>=<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">0</span>, <span style=\"color: #808000; text-decoration-color: #808000\">total</span>=<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">0</span><span style=\"font-weight: bold\">)</span>\n",
              "<span style=\"font-weight: bold\">)</span>\n",
              "</pre>\n"
            ],
            "text/plain": [
              "\u001b[1;35mBatch\u001b[0m\u001b[1m(\u001b[0m\n",
              "    \u001b[33mid\u001b[0m=\u001b[32m'batch_katOshqMSik1DI8qwXtuHaUj'\u001b[0m,\n",
              "    \u001b[33mcompletion_window\u001b[0m=\u001b[32m'24h'\u001b[0m,\n",
              "    \u001b[33mcreated_at\u001b[0m=\u001b[1;36m1726847573\u001b[0m,\n",
              "    \u001b[33mendpoint\u001b[0m=\u001b[32m'/v1/embeddings'\u001b[0m,\n",
              "    \u001b[33minput_file_id\u001b[0m=\u001b[32m'file-5UhXBkBba7yLBUhEALqUQea3'\u001b[0m,\n",
              "    \u001b[33mobject\u001b[0m=\u001b[32m'batch'\u001b[0m,\n",
              "    \u001b[33mstatus\u001b[0m=\u001b[32m'validating'\u001b[0m,\n",
              "    \u001b[33mcancelled_at\u001b[0m=\u001b[3;35mNone\u001b[0m,\n",
              "    \u001b[33mcancelling_at\u001b[0m=\u001b[3;35mNone\u001b[0m,\n",
              "    \u001b[33mcompleted_at\u001b[0m=\u001b[3;35mNone\u001b[0m,\n",
              "    \u001b[33merror_file_id\u001b[0m=\u001b[3;35mNone\u001b[0m,\n",
              "    \u001b[33merrors\u001b[0m=\u001b[3;35mNone\u001b[0m,\n",
              "    \u001b[33mexpired_at\u001b[0m=\u001b[3;35mNone\u001b[0m,\n",
              "    \u001b[33mexpires_at\u001b[0m=\u001b[1;36m1726933973\u001b[0m,\n",
              "    \u001b[33mfailed_at\u001b[0m=\u001b[3;35mNone\u001b[0m,\n",
              "    \u001b[33mfinalizing_at\u001b[0m=\u001b[3;35mNone\u001b[0m,\n",
              "    \u001b[33min_progress_at\u001b[0m=\u001b[3;35mNone\u001b[0m,\n",
              "    \u001b[33mmetadata\u001b[0m=\u001b[1m{\u001b[0m\u001b[32m'description'\u001b[0m: \u001b[32m'nightly eval job'\u001b[0m\u001b[1m}\u001b[0m,\n",
              "    \u001b[33moutput_file_id\u001b[0m=\u001b[3;35mNone\u001b[0m,\n",
              "    \u001b[33mrequest_counts\u001b[0m=\u001b[1;35mBatchRequestCounts\u001b[0m\u001b[1m(\u001b[0m\u001b[33mcompleted\u001b[0m=\u001b[1;36m0\u001b[0m, \u001b[33mfailed\u001b[0m=\u001b[1;36m0\u001b[0m, \u001b[33mtotal\u001b[0m=\u001b[1;36m0\u001b[0m\u001b[1m)\u001b[0m\n",
              "\u001b[1m)\u001b[0m\n"
            ]
          },
          "metadata": {},
          "output_type": "display_data"
        }
      ],
      "source": [
        "print(batch)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 22,
      "id": "5e5df551",
      "metadata": {},
      "outputs": [
        {
          "data": {
            "text/html": [
              "<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"color: #800080; text-decoration-color: #800080; font-weight: bold\">Batch</span><span style=\"font-weight: bold\">(</span>\n",
              "    <span style=\"color: #808000; text-decoration-color: #808000\">id</span>=<span style=\"color: #008000; text-decoration-color: #008000\">'batch_katOshqMSik1DI8qwXtuHaUj'</span>,\n",
              "    <span style=\"color: #808000; text-decoration-color: #808000\">completion_window</span>=<span style=\"color: #008000; text-decoration-color: #008000\">'24h'</span>,\n",
              "    <span style=\"color: #808000; text-decoration-color: #808000\">created_at</span>=<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">1726847573</span>,\n",
              "    <span style=\"color: #808000; text-decoration-color: #808000\">endpoint</span>=<span style=\"color: #008000; text-decoration-color: #008000\">'/v1/embeddings'</span>,\n",
              "    <span style=\"color: #808000; text-decoration-color: #808000\">input_file_id</span>=<span style=\"color: #008000; text-decoration-color: #008000\">'file-5UhXBkBba7yLBUhEALqUQea3'</span>,\n",
              "    <span style=\"color: #808000; text-decoration-color: #808000\">object</span>=<span style=\"color: #008000; text-decoration-color: #008000\">'batch'</span>,\n",
              "    <span style=\"color: #808000; text-decoration-color: #808000\">status</span>=<span style=\"color: #008000; text-decoration-color: #008000\">'completed'</span>,\n",
              "    <span style=\"color: #808000; text-decoration-color: #808000\">cancelled_at</span>=<span style=\"color: #800080; text-decoration-color: #800080; font-style: italic\">None</span>,\n",
              "    <span style=\"color: #808000; text-decoration-color: #808000\">cancelling_at</span>=<span style=\"color: #800080; text-decoration-color: #800080; font-style: italic\">None</span>,\n",
              "    <span style=\"color: #808000; text-decoration-color: #808000\">completed_at</span>=<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">1726847576</span>,\n",
              "    <span style=\"color: #808000; text-decoration-color: #808000\">error_file_id</span>=<span style=\"color: #800080; text-decoration-color: #800080; font-style: italic\">None</span>,\n",
              "    <span style=\"color: #808000; text-decoration-color: #808000\">errors</span>=<span style=\"color: #800080; text-decoration-color: #800080; font-style: italic\">None</span>,\n",
              "    <span style=\"color: #808000; text-decoration-color: #808000\">expired_at</span>=<span style=\"color: #800080; text-decoration-color: #800080; font-style: italic\">None</span>,\n",
              "    <span style=\"color: #808000; text-decoration-color: #808000\">expires_at</span>=<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">1726933973</span>,\n",
              "    <span style=\"color: #808000; text-decoration-color: #808000\">failed_at</span>=<span style=\"color: #800080; text-decoration-color: #800080; font-style: italic\">None</span>,\n",
              "    <span style=\"color: #808000; text-decoration-color: #808000\">finalizing_at</span>=<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">1726847576</span>,\n",
              "    <span style=\"color: #808000; text-decoration-color: #808000\">in_progress_at</span>=<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">1726847574</span>,\n",
              "    <span style=\"color: #808000; text-decoration-color: #808000\">metadata</span>=<span style=\"font-weight: bold\">{</span><span style=\"color: #008000; text-decoration-color: #008000\">'description'</span>: <span style=\"color: #008000; text-decoration-color: #008000\">'nightly eval job'</span><span style=\"font-weight: bold\">}</span>,\n",
              "    <span style=\"color: #808000; text-decoration-color: #808000\">output_file_id</span>=<span style=\"color: #008000; text-decoration-color: #008000\">'file-fWmorUHxdNWkFGNLreI8JCCE'</span>,\n",
              "    <span style=\"color: #808000; text-decoration-color: #808000\">request_counts</span>=<span style=\"color: #800080; text-decoration-color: #800080; font-weight: bold\">BatchRequestCounts</span><span style=\"font-weight: bold\">(</span><span style=\"color: #808000; text-decoration-color: #808000\">completed</span>=<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">2</span>, <span style=\"color: #808000; text-decoration-color: #808000\">failed</span>=<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">0</span>, <span style=\"color: #808000; text-decoration-color: #808000\">total</span>=<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">2</span><span style=\"font-weight: bold\">)</span>\n",
              "<span style=\"font-weight: bold\">)</span>\n",
              "</pre>\n"
            ],
            "text/plain": [
              "\u001b[1;35mBatch\u001b[0m\u001b[1m(\u001b[0m\n",
              "    \u001b[33mid\u001b[0m=\u001b[32m'batch_katOshqMSik1DI8qwXtuHaUj'\u001b[0m,\n",
              "    \u001b[33mcompletion_window\u001b[0m=\u001b[32m'24h'\u001b[0m,\n",
              "    \u001b[33mcreated_at\u001b[0m=\u001b[1;36m1726847573\u001b[0m,\n",
              "    \u001b[33mendpoint\u001b[0m=\u001b[32m'/v1/embeddings'\u001b[0m,\n",
              "    \u001b[33minput_file_id\u001b[0m=\u001b[32m'file-5UhXBkBba7yLBUhEALqUQea3'\u001b[0m,\n",
              "    \u001b[33mobject\u001b[0m=\u001b[32m'batch'\u001b[0m,\n",
              "    \u001b[33mstatus\u001b[0m=\u001b[32m'completed'\u001b[0m,\n",
              "    \u001b[33mcancelled_at\u001b[0m=\u001b[3;35mNone\u001b[0m,\n",
              "    \u001b[33mcancelling_at\u001b[0m=\u001b[3;35mNone\u001b[0m,\n",
              "    \u001b[33mcompleted_at\u001b[0m=\u001b[1;36m1726847576\u001b[0m,\n",
              "    \u001b[33merror_file_id\u001b[0m=\u001b[3;35mNone\u001b[0m,\n",
              "    \u001b[33merrors\u001b[0m=\u001b[3;35mNone\u001b[0m,\n",
              "    \u001b[33mexpired_at\u001b[0m=\u001b[3;35mNone\u001b[0m,\n",
              "    \u001b[33mexpires_at\u001b[0m=\u001b[1;36m1726933973\u001b[0m,\n",
              "    \u001b[33mfailed_at\u001b[0m=\u001b[3;35mNone\u001b[0m,\n",
              "    \u001b[33mfinalizing_at\u001b[0m=\u001b[1;36m1726847576\u001b[0m,\n",
              "    \u001b[33min_progress_at\u001b[0m=\u001b[1;36m1726847574\u001b[0m,\n",
              "    \u001b[33mmetadata\u001b[0m=\u001b[1m{\u001b[0m\u001b[32m'description'\u001b[0m: \u001b[32m'nightly eval job'\u001b[0m\u001b[1m}\u001b[0m,\n",
              "    \u001b[33moutput_file_id\u001b[0m=\u001b[32m'file-fWmorUHxdNWkFGNLreI8JCCE'\u001b[0m,\n",
              "    \u001b[33mrequest_counts\u001b[0m=\u001b[1;35mBatchRequestCounts\u001b[0m\u001b[1m(\u001b[0m\u001b[33mcompleted\u001b[0m=\u001b[1;36m2\u001b[0m, \u001b[33mfailed\u001b[0m=\u001b[1;36m0\u001b[0m, \u001b[33mtotal\u001b[0m=\u001b[1;36m2\u001b[0m\u001b[1m)\u001b[0m\n",
              "\u001b[1m)\u001b[0m\n"
            ]
          },
          "metadata": {},
          "output_type": "display_data"
        }
      ],
      "source": [
        "from openai import OpenAI\n",
        "client = OpenAI()\n",
        "\n",
        "batch_status = client.batches.retrieve(batch.id)\n",
        "print(batch_status)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 23,
      "id": "02726a51",
      "metadata": {},
      "outputs": [
        {
          "name": "stderr",
          "output_type": "stream",
          "text": [
            "/tmp/ipykernel_281948/1607969247.py:2: DeprecationWarning: Due to a bug, this method doesn't actually stream the response content, `.with_streaming_response.method()` should be used instead\n",
            "  file_response.astream_to_file(\"result.json\")\n"
          ]
        },
        {
          "data": {
            "text/plain": [
              "<coroutine object HttpxBinaryResponseContent.astream_to_file at 0x762f9c6e1070>"
            ]
          },
          "execution_count": 23,
          "metadata": {},
          "output_type": "execute_result"
        }
      ],
      "source": [
        "file_response = client.files.content(batch_status.output_file_id)\n",
        "file_response.astream_to_file(\"result.json\")"
      ]
    },
    {
      "cell_type": "markdown",
      "id": "22fb9e0a-566b-4f34-b9cf-72193cb51adb",
      "metadata": {
        "id": "22fb9e0a-566b-4f34-b9cf-72193cb51adb"
      },
      "source": [
        "## OpenAI\n",
        "\n",
        "You will need an [OpenAI](https://openai.com/) api key for this tutorial. Login to your [platform.openai.com](https://platform.openai.com/) account, click on your profile picture in the upper right corner, and choose 'API Keys' from the menu. Create an API key for this tutorial and save it. You will need it below."
      ]
    },
    {
      "cell_type": "markdown",
      "id": "PWMbn7GooMm5",
      "metadata": {
        "id": "PWMbn7GooMm5"
      },
      "source": [
        "Set your OpenAI api key, and Pinecone api key and environment in the file we created."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "QOyfIoXAoVGX",
      "metadata": {
        "id": "QOyfIoXAoVGX"
      },
      "outputs": [],
      "source": [
        "import os\n",
        "from dotenv import load_dotenv\n",
        "\n",
        "load_dotenv()"
      ]
    },
    {
      "cell_type": "markdown",
      "id": "493cfa6d",
      "metadata": {},
      "source": [
        "## Setting up Vector Database\n",
        "\n",
        "We will be using qDrant as the Vector database\n",
        "There are 4 ways to initialize qdrant \n",
        "\n",
        "1. Inmemory\n",
        "```python\n",
        "client = qdrant_client.QdrantClient(location=\":memory:\")\n",
        "```\n",
        "2. Disk\n",
        "```python\n",
        "client = qdrant_client.QdrantClient(path=\"./data\")\n",
        "```\n",
        "3. Self hosted or Docker\n",
        "```python\n",
        "\n",
        "client = qdrant_client.QdrantClient(\n",
        "    # url=\"http://<host>:<port>\"\n",
        "    host=\"localhost\",port=6333\n",
        ")\n",
        "```\n",
        "\n",
        "4. Qdrant cloud\n",
        "```python\n",
        "client = qdrant_client.QdrantClient(\n",
        "    url=QDRANT_CLOUD_ENDPOINT,\n",
        "    api_key=QDRANT_API_KEY,\n",
        ")\n",
        "```\n",
        "\n",
        "for this notebook we will be using qdrant cloud"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "0f2746a9",
      "metadata": {},
      "outputs": [],
      "source": [
        "import qdrant_client\n",
        "\n",
        "# LlamaIndex core imports\n",
        "from llama_index.core import VectorStoreIndex, SimpleDirectoryReader\n",
        "from llama_index.core import Settings\n",
        "\n",
        "# LlamaIndex vector store import\n",
        "from llama_index.vector_stores.qdrant import QdrantVectorStore\n",
        "\n",
        "# creating a qdrant client instance\n",
        "\n",
        "client = qdrant_client.QdrantClient(\n",
        "    # you can use :memory: mode for fast and light-weight experiments,\n",
        "    # it does not require to have Qdrant deployed anywhere\n",
        "    # but requires qdrant-client >= 1.1.1\n",
        "    # location=\":memory:\"\n",
        "    # otherwise set Qdrant instance address with:\n",
        "    # url=QDRANT_CLOUD_ENDPOINT,\n",
        "    # otherwise set Qdrant instance with host and port:\n",
        "    host=\"localhost\",\n",
        "    port=6333\n",
        "    # set API KEY for Qdrant Cloud\n",
        "    # api_key=QDRANT_API_KEY,\n",
        "    # path=\"./db/\"\n",
        ")\n",
        "\n",
        "vector_store = QdrantVectorStore(client=client, collection_name=\"01_Data_Ingestion\")"
      ]
    },
    {
      "cell_type": "markdown",
      "id": "be9437a0-3d52-4586-8217-43944971a2cc",
      "metadata": {
        "id": "be9437a0-3d52-4586-8217-43944971a2cc"
      },
      "source": [
        "## Build an Ingestion Pipeline from Scratch\n",
        "\n",
        "We show how to build an ingestion pipeline as mentioned in the introduction.\n",
        "\n",
        "Note that steps (2) and (3) can be handled via our `NodeParser` abstractions, which handle splitting and node creation.\n",
        "\n",
        "For the purposes of this tutorial, we show you how to create these objects manually."
      ]
    },
    {
      "cell_type": "markdown",
      "id": "6d1c9630-2a6b-4656-b272-de1b869c8977",
      "metadata": {
        "id": "6d1c9630-2a6b-4656-b272-de1b869c8977"
      },
      "source": [
        "### 1. Load Data"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "48739cfc-c05a-420a-8c78-280892f8d7a0",
      "metadata": {
        "id": "48739cfc-c05a-420a-8c78-280892f8d7a0",
        "outputId": "cf6c21a1-d0a7-4b4b-dc40-93ce99824100"
      },
      "outputs": [
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "--2023-10-13 01:45:14--  https://arxiv.org/pdf/2307.09288.pdf\n",
            "Resolving arxiv.org (arxiv.org)... 128.84.21.199\n",
            "Connecting to arxiv.org (arxiv.org)|128.84.21.199|:443... connected.\n",
            "HTTP request sent, awaiting response... 200 OK\n",
            "Length: 13661300 (13M) [application/pdf]\n",
            "Saving to: ‘data/llama2.pdf’\n",
            "\n",
            "data/llama2.pdf     100%[===================>]  13.03M  7.59MB/s    in 1.7s    \n",
            "\n",
            "2023-10-13 01:45:16 (7.59 MB/s) - ‘data/llama2.pdf’ saved [13661300/13661300]\n"
          ]
        }
      ],
      "source": [
        "!mkdir data\n",
        "!wget --user-agent \"Mozilla\" \"https://arxiv.org/pdf/2307.09288.pdf\" -O \"../data/llama2.pdf\""
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "079666c5-0685-413d-a765-17f71ae89506",
      "metadata": {
        "id": "079666c5-0685-413d-a765-17f71ae89506"
      },
      "outputs": [],
      "source": [
        "import fitz"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "4eee7692-2188-4552-9f2e-cb90ac6b7678",
      "metadata": {
        "id": "4eee7692-2188-4552-9f2e-cb90ac6b7678"
      },
      "outputs": [],
      "source": [
        "file_path = \"../data/llama2.pdf\"\n",
        "doc = fitz.open(file_path)"
      ]
    },
    {
      "cell_type": "markdown",
      "id": "74c573db-1863-45c3-9049-8bad535e6e35",
      "metadata": {
        "id": "74c573db-1863-45c3-9049-8bad535e6e35"
      },
      "source": [
        "### 2. Use a Text Splitter to Split Documents\n",
        "\n",
        "Here we import our `SentenceSplitter` to split document texts into smaller chunks, while preserving paragraphs/sentences as much as possible."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "9e175007-84d5-406e-bf5f-6ecacfbfd152",
      "metadata": {
        "id": "9e175007-84d5-406e-bf5f-6ecacfbfd152"
      },
      "outputs": [],
      "source": [
        "from llama_index.core.node_parser import SentenceSplitter"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "0dbccb26-ea2a-48c9-adb4-1ebe88adaa1c",
      "metadata": {
        "id": "0dbccb26-ea2a-48c9-adb4-1ebe88adaa1c"
      },
      "outputs": [],
      "source": [
        "text_parser = SentenceSplitter(\n",
        "    chunk_size=1024,\n",
        "    # separator=\" \",\n",
        ")"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "7a9bed96-adfa-40c9-92bd-9dba68d58730",
      "metadata": {
        "id": "7a9bed96-adfa-40c9-92bd-9dba68d58730"
      },
      "outputs": [],
      "source": [
        "text_chunks = []\n",
        "# maintain relationship with source doc index, to help inject doc metadata in (3)\n",
        "doc_idxs = []\n",
        "for doc_idx, page in enumerate(doc):\n",
        "    page_text = page.get_text(\"text\")\n",
        "    cur_text_chunks = text_parser.split_text(page_text)\n",
        "    text_chunks.extend(cur_text_chunks)\n",
        "    doc_idxs.extend([doc_idx] * len(cur_text_chunks))"
      ]
    },
    {
      "cell_type": "markdown",
      "id": "354157d6-b436-4f0a-bf6e-f0a197e54c60",
      "metadata": {
        "id": "354157d6-b436-4f0a-bf6e-f0a197e54c60"
      },
      "source": [
        "### 3. Manually Construct Nodes from Text Chunks\n",
        "\n",
        "We convert each chunk into a `TextNode` object, a low-level data abstraction in LlamaIndex that stores content but also allows defining metadata + relationships with other Nodes.\n",
        "\n",
        "We inject metadata from the document into each node.\n",
        "\n",
        "This essentially replicates logic in our `SentenceSplitter`."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "33b93044-3eb4-4c77-bc40-be53dffd3749",
      "metadata": {
        "id": "33b93044-3eb4-4c77-bc40-be53dffd3749"
      },
      "outputs": [],
      "source": [
        "from llama_index.core.schema import TextNode"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "adbfcb3f-5554-4594-ae80-7236e28485aa",
      "metadata": {
        "id": "adbfcb3f-5554-4594-ae80-7236e28485aa"
      },
      "outputs": [],
      "source": [
        "nodes = []\n",
        "for idx, text_chunk in enumerate(text_chunks):\n",
        "    node = TextNode(\n",
        "        text=text_chunk,\n",
        "    )\n",
        "    src_doc_idx = doc_idxs[idx]\n",
        "    src_page = doc[src_doc_idx]\n",
        "    nodes.append(node)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "3iidPVIiwYUg",
      "metadata": {
        "id": "3iidPVIiwYUg"
      },
      "outputs": [],
      "source": [
        "print(nodes[0].metadata)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "257bb2e3-608a-4542-ba29-f29b59771a3f",
      "metadata": {
        "id": "257bb2e3-608a-4542-ba29-f29b59771a3f"
      },
      "outputs": [],
      "source": [
        "# print a sample node\n",
        "print(nodes[0].get_content(metadata_mode=\"all\"))"
      ]
    },
    {
      "cell_type": "markdown",
      "id": "fac468f5-870c-4486-b576-2aa6d9eaf322",
      "metadata": {
        "id": "fac468f5-870c-4486-b576-2aa6d9eaf322"
      },
      "source": [
        "### [Optional] 4. Extract Metadata from each Node\n",
        "\n",
        "We extract metadata from each Node using our Metadata extractors.\n",
        "\n",
        "This will add more metadata to each Node."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "9c369188-9cc9-4550-924e-b29d212ad057",
      "metadata": {
        "id": "9c369188-9cc9-4550-924e-b29d212ad057"
      },
      "outputs": [],
      "source": [
        "from llama_index.core.extractors import (\n",
        "    QuestionsAnsweredExtractor,\n",
        "    TitleExtractor,\n",
        ")\n",
        "from llama_index.core.ingestion import IngestionPipeline\n",
        "from llama_index.llms.openai import OpenAI\n",
        "\n",
        "llm = OpenAI(model=\"gpt-3.5-turbo\")\n",
        "\n",
        "extractors = [\n",
        "    TitleExtractor(nodes=5, llm=llm),\n",
        "    QuestionsAnsweredExtractor(questions=3, llm=llm),\n",
        "]"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "f5501ffc-9bbb-4b48-9181-4e4e371e8f41",
      "metadata": {
        "id": "f5501ffc-9bbb-4b48-9181-4e4e371e8f41"
      },
      "outputs": [],
      "source": [
        "pipeline = IngestionPipeline(\n",
        "    transformations=extractors,\n",
        ")\n",
        "nodes = await pipeline.arun(nodes=nodes, in_place=False)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "WgbKmXr3ytPf",
      "metadata": {
        "id": "WgbKmXr3ytPf"
      },
      "outputs": [],
      "source": [
        "print(nodes[0].metadata)"
      ]
    },
    {
      "cell_type": "markdown",
      "id": "9d52522c-ffee-49d1-8651-658d70248053",
      "metadata": {
        "id": "9d52522c-ffee-49d1-8651-658d70248053"
      },
      "source": [
        "### 5. Generate Embeddings for each Node\n",
        "\n",
        "Generate document embeddings for each Node using our OpenAI embedding model (`text-embedding-ada-002`).\n",
        "\n",
        "Store these on the `embedding` property on each Node."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "6e071e36-e609-4a0c-a478-e8cfbe751cff",
      "metadata": {
        "id": "6e071e36-e609-4a0c-a478-e8cfbe751cff"
      },
      "outputs": [],
      "source": [
        "from llama_index.embeddings.openai import OpenAIEmbedding\n",
        "\n",
        "embed_model = OpenAIEmbedding()"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "2047ca46-729f-4c5a-a8d7-3bc860604333",
      "metadata": {
        "id": "2047ca46-729f-4c5a-a8d7-3bc860604333"
      },
      "outputs": [],
      "source": [
        "for node in nodes:\n",
        "    node_embedding = embed_model.get_text_embedding(\n",
        "        node.get_content(metadata_mode=\"all\")\n",
        "    )\n",
        "    node.embedding = node_embedding"
      ]
    },
    {
      "cell_type": "markdown",
      "id": "11f78014-dcca-43f5-95cb-9cfb5140b30e",
      "metadata": {
        "id": "11f78014-dcca-43f5-95cb-9cfb5140b30e"
      },
      "source": [
        "### 6. Load Nodes into a Vector Store\n",
        "\n",
        "We now insert these nodes into our `PineconeVectorStore`.\n",
        "\n",
        "**NOTE**: We skip the VectorStoreIndex abstraction, which is a higher-level abstraction that handles ingestion as well. We use `VectorStoreIndex` in the next section to fast-track retrieval/querying."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "fe34fbe4-3396-402e-8599-4b42013c3016",
      "metadata": {
        "id": "fe34fbe4-3396-402e-8599-4b42013c3016"
      },
      "outputs": [],
      "source": [
        "vector_store.add(nodes)"
      ]
    },
    {
      "cell_type": "markdown",
      "id": "74e7f8bd-d92a-40ad-8d9b-a18c04ddfca9",
      "metadata": {
        "id": "74e7f8bd-d92a-40ad-8d9b-a18c04ddfca9"
      },
      "source": [
        "## Retrieve and Query from the Vector Store\n",
        "\n",
        "Now that our ingestion is complete, we can retrieve/query this vector store.\n",
        "\n",
        "**NOTE**: We can use our high-level `VectorStoreIndex` abstraction here. See the next section to see how to define retrieval at a lower-level!"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "be6a4fe1-2665-43e6-a872-8e631e31b0fd",
      "metadata": {
        "id": "be6a4fe1-2665-43e6-a872-8e631e31b0fd"
      },
      "outputs": [],
      "source": [
        "from llama_index.core import VectorStoreIndex\n",
        "from llama_index.core import StorageContext"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "0e9d2495-d4f7-469a-9cea-a5cfc401c085",
      "metadata": {
        "id": "0e9d2495-d4f7-469a-9cea-a5cfc401c085"
      },
      "outputs": [],
      "source": [
        "index = VectorStoreIndex.from_vector_store(vector_store)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "5c89e1c1-8ed1-45f5-b2a4-7c3382195693",
      "metadata": {
        "id": "5c89e1c1-8ed1-45f5-b2a4-7c3382195693"
      },
      "outputs": [],
      "source": [
        "query_engine = index.as_query_engine()"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "cd6e0ddb-97c9-4f42-8843-a36a29ba3f17",
      "metadata": {
        "id": "cd6e0ddb-97c9-4f42-8843-a36a29ba3f17"
      },
      "outputs": [],
      "source": [
        "query_str = \"Can you tell me about the key concepts for safety finetuning\""
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "950cae37-7bad-44a3-be51-4154a8630818",
      "metadata": {
        "id": "950cae37-7bad-44a3-be51-4154a8630818"
      },
      "outputs": [],
      "source": [
        "response = query_engine.query(query_str)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "c0b309bb-ca5a-4b15-948c-687038361c91",
      "metadata": {
        "id": "c0b309bb-ca5a-4b15-948c-687038361c91"
      },
      "outputs": [],
      "source": [
        "print(str(response))"
      ]
    }
  ],
  "metadata": {
    "colab": {
      "provenance": []
    },
    "kernelspec": {
      "display_name": "basic-rag",
      "language": "python",
      "name": "python3"
    },
    "language_info": {
      "codemirror_mode": {
        "name": "ipython",
        "version": 3
      },
      "file_extension": ".py",
      "mimetype": "text/x-python",
      "name": "python",
      "nbconvert_exporter": "python",
      "pygments_lexer": "ipython3",
      "version": "3.10.14"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 5
}
